This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
#import libraries
library(plyr)
library(tidyverse)
[30m── [1mAttaching packages[22m ──────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.2.0 [32m✔[30m [34mpurrr [30m 0.3.2
[32m✔[30m [34mtibble [30m 2.1.1 [32m✔[30m [34mdplyr [30m 0.8.3
[32m✔[30m [34mtidyr [30m 0.8.3 [32m✔[30m [34mstringr[30m 1.4.0
[32m✔[30m [34mreadr [30m 1.3.1 [32m✔[30m [34mforcats[30m 0.4.0[39m
[30m── [1mConflicts[22m ─────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32marrange()[30m masks [34mplyr[30m::arrange()
[31m✖[30m [34mpurrr[30m::[32mcompact()[30m masks [34mplyr[30m::compact()
[31m✖[30m [34mdplyr[30m::[32mcount()[30m masks [34mplyr[30m::count()
[31m✖[30m [34mdplyr[30m::[32mfailwith()[30m masks [34mplyr[30m::failwith()
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mid()[30m masks [34mplyr[30m::id()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31m✖[30m [34mdplyr[30m::[32mmutate()[30m masks [34mplyr[30m::mutate()
[31m✖[30m [34mdplyr[30m::[32mrename()[30m masks [34mplyr[30m::rename()
[31m✖[30m [34mdplyr[30m::[32msummarise()[30m masks [34mplyr[30m::summarise()
[31m✖[30m [34mdplyr[30m::[32msummarize()[30m masks [34mplyr[30m::summarize()[39m
library(dplyr)
library(corrplot)
corrplot 0.84 loaded
library(ggplot2)
library(ggcorrplot)
library(data.table)
data.table 1.12.2 using 2 threads (see ?getDTthreads). Latest news: r-datatable.com
Attaching package: ‘data.table’
The following objects are masked from ‘package:dplyr’:
between, first, last
The following object is masked from ‘package:purrr’:
transpose
#Import dataset diemthi2019.csv
diemthi2019 <- read.csv('~/Desktop/2019 vietnam national high school exam/diemthi2019.csv', row.names = NULL)
#Choose necessary columns and reorder them
head(diemthi2019)
str(diemthi2019)
'data.frame': 811851 obs. of 12 variables:
$ X : int 0 1 2 3 4 5 6 7 8 9 ...
$ Dia : num 7.75 NA 4.25 4.75 8.25 NA NA 6.25 NA 7.5 ...
$ GDCD : num 8.75 NA 5.75 5 7.5 NA NA 8.75 NA 8.5 ...
$ Hoa : num NA 4.5 NA NA NA 5.75 NA NA 3.5 NA ...
$ Li : num NA 8.25 NA NA NA 7.5 NA NA 5.5 NA ...
$ Ma_mon_ngoai_ngu: Factor w/ 7 levels "","N1","N2","N3",..: 2 2 1 1 2 2 1 2 2 2 ...
$ Ngoai_ngu : num 7.6 8 NA NA 4.4 4.4 NA 8.8 2.2 5.2 ...
$ Sinh : num NA 6 NA NA NA 3 NA NA 3.75 NA ...
$ Su : num 8.25 NA 4 2.25 7.75 NA 3.25 5.25 NA 3.25 ...
$ Toan : num 8.2 8.6 5.2 4.4 5.8 7.2 2.2 5 5.6 6.6 ...
$ Van : num 8 6.17 4.75 4.5 6 6.75 5.75 6.5 4.5 7 ...
$ sbd : int 24008611 51000032 51000005 51000021 51000013 51000003 51000001 51000011 51000007 51000010 ...
#Import and modify dataset province_code.csv
province_code <- read.csv('~/Desktop/2019 vietnam national high school exam/province_code.csv', sep = ";", row.names = NULL)
#Change column type of province_code.csv
province_code$Province_code <- as.numeric(province_code$Province_code)
province_code$Province_name <- as.factor(province_code$Province_name)
head(province_code)
str(province_code)
'data.frame': 64 obs. of 2 variables:
$ Province_code: num 1 2 3 4 5 6 7 8 9 10 ...
$ Province_name: Factor w/ 63 levels "","An Giang",..: 24 29 27 16 22 15 35 38 60 37 ...
#Change column name for diemthi2019.csv
diemthi2019 <- plyr::rename(diemthi2019, c(
'sbd' = 'Student_index',
'Toan'= 'Maths',
'Van' = 'Literature',
'Ngoai_ngu' = 'Foreign_language',
'Ma_mon_ngoai_ngu' = 'Foreign_language_code',
'Li' = 'Physics',
'Hoa' = 'Chemistry',
'Sinh' = 'Biology',
'Su' = 'History',
'Dia' = 'Geography',
'GDCD' = 'Citizenship_education'
)
)
head(diemthi2019)
#Insert province_code for diemthi2019.csv
Province_code <- ifelse(diemthi2019$Student_index > 10000000,
substr(as.character(diemthi2019$Student_index), start = 1, stop = 2),
substr(as.character(diemthi2019$Student_index), start = 1, stop = 1)
)
diemthi2019$Province_code <- as.numeric(Province_code)
#insert foreign language names with respect to code
diemthi2019$Foreign_language_name <- diemthi2019$Foreign_language_code
diemthi2019$Foreign_language_name <- mapvalues(diemthi2019$Foreign_language_code, from=c("N1", "N2", "N3", "N4", "N5", "N6"), to=c("English", "Russian", "French", "Chinese", "German", "Japanese"))
head(diemthi2019)
#Merge diemthi2019.csv and province_code.csv
diemthi2019 <- join(diemthi2019, province_code, by = "Province_code")
head(diemthi2019)
#Calculate scores of combinations A00, A01, B00, C00, D00
diemthi2019$A00 <- diemthi2019$Maths + diemthi2019$Physics + diemthi2019$Chemistry
if(diemthi2019$Foreign_language_code == 'N1') {
diemthi2019$A01 <- diemthi2019$Maths + diemthi2019$Physics + diemthi2019$Foreign_language
}
the condition has length > 1 and only the first element will be used
diemthi2019$B00 <- diemthi2019$Maths + diemthi2019$Chemistry + diemthi2019$Biology
diemthi2019$C00 <- diemthi2019$Literature + diemthi2019$History + diemthi2019$Geography
if(diemthi2019$Foreign_language_code == 'N1') {
diemthi2019$D00 <- diemthi2019$Maths + diemthi2019$Literature + diemthi2019$Foreign_language
}
the condition has length > 1 and only the first element will be used
head(diemthi2019)
#Calculate average of natural science combination and social science combination
diemthi2019$Avg_natural_sciences <- (diemthi2019$Physics + diemthi2019$Chemistry + diemthi2019$Biology)/3
diemthi2019$Avg_social_sciences <- (diemthi2019$History + diemthi2019$Geography + diemthi2019$Citizenship_education)/3
#Reorder and choose necessary columns
diemthi2019 <- diemthi2019[c('Student_index', 'Province_code', 'Province_name', 'Maths', 'Literature', 'Foreign_language' ,'Foreign_language_code', 'Foreign_language_name', 'Physics', 'Chemistry', 'Biology', 'Avg_natural_sciences', 'History', 'Geography', 'Citizenship_education', 'Avg_social_sciences', 'A00', 'A01', 'B00', 'C00', 'D00')]
head(diemthi2019)
#numerical df for correlation matrix
diemthi2019_corr <- diemthi2019[c(4:6,9:21)]
#correlation matrix
diemthi2019_corr_matrix <- cor(diemthi2019_corr, use="pairwise.complete.obs")
View(diemthi2019_corr_matrix)
#correlation heatmap
ggcorrplot(diemthi2019_corr_matrix, hc.order = TRUE,
lab = TRUE)
#Number of students who takes each foreign language test
table(diemthi2019$Foreign_language_name)
English Russian French Chinese German Japanese
83338 726064 148 646 801 81 773
#Number of students who took each subject
took_subject <- list()
for(i in c(4:6, 9:11, 13:15)) {
print(paste('The number of students who took the subject', colnames(diemthi2019[i]), 'is', sum(!is.na(diemthi2019[,i]) == TRUE)))
}
[1] "The number of students who took the subject Maths is 807762"
[1] "The number of students who took the subject Literature is 799208"
[1] "The number of students who took the subject Foreign_language is 728513"
[1] "The number of students who took the subject Physics is 292166"
[1] "The number of students who took the subject Chemistry is 295544"
[1] "The number of students who took the subject Biology is 291199"
[1] "The number of students who took the subject History is 542729"
[1] "The number of students who took the subject Geography is 535897"
[1] "The number of students who took the subject Citizenship_education is 470905"
#Number of students who are eligible to be considered for admission with each combination A00, A01, B00, C00, D00
combination_eligible <- list()
for(i in c(17:21)) {
print(paste('The number of students who are eligible to be considered for combination', colnames(diemthi2019[i]), 'is', sum(!is.na(diemthi2019[,i]) == TRUE)))
combination_eligible[[i-16]] = table(!is.na(diemthi2019[,i]), dnn = paste(colnames(diemthi2019)[i]))
}
[1] "The number of students who are eligible to be considered for combination A00 is 290759"
[1] "The number of students who are eligible to be considered for combination A01 is 281100"
[1] "The number of students who are eligible to be considered for combination B00 is 291078"
[1] "The number of students who are eligible to be considered for combination C00 is 535427"
[1] "The number of students who are eligible to be considered for combination D00 is 725824"
combination_eligible
[[1]]
A00
FALSE TRUE
521092 290759
[[2]]
A01
FALSE TRUE
530751 281100
[[3]]
B00
FALSE TRUE
520773 291078
[[4]]
C00
FALSE TRUE
276424 535427
[[5]]
D00
FALSE TRUE
86027 725824
#Number of students who took all natural science subjects, all social science subjects, or all of both
print(paste('The number of students who took all natural science subjects is', sum(!is.na(diemthi2019[,13]) == TRUE)))
[1] "The number of students who took all natural science subjects is 542729"
print(paste('The number of students who took all social science subjects is', sum(!is.na(diemthi2019[,17]) == TRUE)))
[1] "The number of students who took all social science subjects is 290759"
print(paste('The number of students who took all natural and social science subjects is', sum(!is.na(diemthi2019[,13]) == TRUE & !is.na(diemthi2019[,17]) == TRUE)))
[1] "The number of students who took all natural and social science subjects is 30746"
#Find the maximum score for each column
for(i in c(4:6,9:21)) {
print(paste('The maximum score of',colnames(diemthi2019)[i],'is', max(diemthi2019[,i], na.rm = TRUE)))
}
[1] "The maximum score of Maths is 10"
[1] "The maximum score of Literature is 9.5"
[1] "The maximum score of Foreign_language is 10"
[1] "The maximum score of Physics is 10"
[1] "The maximum score of Chemistry is 10"
[1] "The maximum score of Biology is 10"
[1] "The maximum score of Avg_natural_sciences is 9.41666666666667"
[1] "The maximum score of History is 10"
[1] "The maximum score of Geography is 10"
[1] "The maximum score of Citizenship_education is 10"
[1] "The maximum score of Avg_social_sciences is 9.83333333333333"
[1] "The maximum score of A00 is 29.05"
[1] "The maximum score of A01 is 28.9"
[1] "The maximum score of B00 is 29.8"
[1] "The maximum score of C00 is 28.75"
[1] "The maximum score of D00 is 28.4"
histogram_out <- list()
for(i in c(4:6,9:21)) {
x = diemthi2019[,i]
histogram_out[[i]] <- ggplot(data.frame(x), aes(x)) +
geom_histogram(binwidth = 0.1, color="black", fill="blue") +
labs(title=paste("Score histogram for",colnames(diemthi2019)[i]), x=colnames(diemthi2019)[i], y="Count") +
geom_vline(data = diemthi2019, xintercept = mean(diemthi2019[,i], na.rm = TRUE), color = "red", linetype = "dashed", size = 1)
}
histogram_out
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
[[5]]
[[6]]
[[7]]
NULL
[[8]]
NULL
[[9]]
[[10]]
[[11]]
[[12]]
[[13]]
[[14]]
[[15]]
[[16]]
[[17]]
[[18]]
[[19]]
[[20]]
[[21]]
histogram_out <- list()
for(i in c(4:6,9:21)) {
x = diemthi2019[,i]
histogram_out[[i]] <- ggplot(data.frame(x), aes(x)) +
geom_histogram(binwidth = 0.1, color="black", fill="blue") +
labs(title=paste("Score histogram for",colnames(diemthi2019)[i]), x=colnames(diemthi2019)[i], y="Count") +
geom_vline(data = diemthi2019, xintercept = mean(diemthi2019[,i], na.rm = TRUE), color = "red", linetype = "dashed", size = 1)
print(histogram_out[[i]])
print(paste('The mean score of', colnames(diemthi2019)[i], 'is', mean(diemthi2019[,i])))
}
[1] "The mean score of Maths is NA"
[1] "The mean score of Literature is NA"
[1] "The mean score of Foreign_language is NA"
[1] "The mean score of Physics is NA"
[1] "The mean score of Chemistry is NA"
[1] "The mean score of Biology is NA"
[1] "The mean score of Avg_natural_sciences is NA"
[1] "The mean score of History is NA"
[1] "The mean score of Geography is NA"
[1] "The mean score of Citizenship_education is NA"
[1] "The mean score of Avg_social_sciences is NA"
[1] "The mean score of A00 is NA"
[1] "The mean score of A01 is NA"
[1] "The mean score of B00 is NA"
[1] "The mean score of C00 is NA"
[1] "The mean score of D00 is NA"
NA
NA
summary(diemthi2019[,3])
An Giang Ba Ria - Vung Tau Bac Giang Bac Kan Bac Lieu Bac Ninh
0 15784 11221 19554 2849 5355 14276
Ben Tre Binh Dinh Binh Duong Binh Phuoc Binh Thuan Ca Mau Can Tho
11316 17773 10854 9705 10885 9292 10471
Cao Bang Da Nang Dak Lak Dak Nong Dien Bien Dong Nai Dong Thap
4741 23290 20471 6292 5287 27525 13098
Ha Giang Ha Nam Ha Noi Ha Tinh Hai Duong Hai Phong Hau Giang
5155 8583 74277 16821 19248 18484 5707
Ho Chi Minh Hoa Binh Hung Yen Khanh Hoa Kien Giang Kon Tum Lai Chau
0 8949 12356 12546 12468 4305 3231
Lam Dong Lang Son Lao Cao Long An Nam Dinh Nghe An Ninh Binh
13888 8830 6230 13350 18275 32242 8948
Ninh Thuan Phu Tho Phu Yen Quang Binh Quang Nam Quang Ngai Quang Ninh
5448 13635 10311 10385 16346 12690 14108
Quang Tri Soc Trang Son La Tay Ninh Thai Binh Thai Nguyen Thanh Hoa
7873 8759 10551 8722 18986 14348 34886
Thua Thien Hue Tien Giang Tra Vinh Tuyen Quang Vinh Long Vinh Phuc Yen Bai
11881 13744 7840 7867 10274 12241 7024
province_frequency <- table(diemthi2019[,3])
#sort(province_frequency, decreasing = FALSE)
province_frequency <-count(diemthi2019, Province_name)
names(province_frequency) <- c('Province', 'Frequency')
province_frequency <- province_frequency[order(-province_frequency$Frequency),]
par(las=2)
View(province_frequency)
ggplot(data=province_frequency, aes(x=reorder(Province,Frequency),y=Frequency)) +
geom_bar(position="dodge", stat="identity") +
coord_flip() +
ggtitle(paste('Barplot of number of students by province')) +
labs(x = "Province", y = "Number of students") +
geom_text(
aes(label = Frequency), colour = "red",
hjust = -0.5, size = 3,
position = position_dodge(width = 1),
inherit.aes = TRUE
)
barplot_mean <- list()
for(i in c(4:6,9:21)) {
x <- aggregate(diemthi2019[,i] ~ diemthi2019$Province_name, diemthi2019, mean)
names(x) <- c('Province', 'Mean')
x <- x[order(x$Mean),]
barplot_mean[[i]] <- ggplot(data=x, aes(x=reorder(Province,Mean),y=Mean)) +
geom_bar(position="dodge",stat="identity") +
coord_flip() +
ggtitle(paste('Barplot for mean of', colnames(diemthi2019)[i], 'by province')) +
labs(x = "Province", y = "Mean score") +
geom_text(
aes(label = Mean), colour = "red",
hjust = -0.5, size = 3,
position = position_dodge(width = 1),
inherit.aes = TRUE
)
}
barplot_mean
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
[[5]]
[[6]]
[[7]]
NULL
[[8]]
NULL
[[9]]
[[10]]
[[11]]
[[12]]
[[13]]
[[14]]
[[15]]
[[16]]
[[17]]
[[18]]
[[19]]
[[20]]
[[21]]